library(readr)
## Warning: package 'readr' was built under R version 4.2.2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ dplyr 1.0.10
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(tidytext)
library(textdata)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.2
## Loading required package: RColorBrewer
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.2.2
library(RColorBrewer)
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.2.2
library(readr)
# Load the combined UFO-sightings-plus-weather data set.
# `month` is forced to character and `hour` is parsed as a time ("%H")
# instead of relying on readr's column-type guessing.
UFO_and_Weather <- read_csv("UFO_and_Weather.csv",
col_types = cols(month = col_character(),
hour = col_time(format = "%H")))
## New names:
## • `` -> `...1`
# NOTE(review): the "New names" message means the CSV has an unnamed first
# column (likely a row index) that readr renamed to `...1` — confirm it can
# be dropped rather than carried through the analysis.
View(UFO_and_Weather)
#new eda goals
More text analysis (July 4th, with and without)
General patterns with and without July 4th
Maybe the timing of sightings on July 4th?
For text analysis, remove the node rows
See which day of the week is most popular
First, let's make a word frequency table
#Step 1:tokenize corpus
# One row per word: pull the free-text sighting summaries and split them
# into individual lowercase tokens.
words <- unnest_tokens(select(UFO_and_Weather, text), output = word, input = text)
head(words)
## # A tibble: 6 Ă— 1
## word
## <chr>
## 1 my
## 2 wife
## 3 was
## 4 driving
## 5 southeast
## 6 on
#Now, we'll generate a count of the words, sort by the number of times the word occurs, and then plot the top 15 words in a bar plot
# Fixes: TRUE instead of the reassignable alias T; reorder(word, -n) instead
# of an anonymous sort function; geom_col() (the idiomatic form for
# pre-counted data) instead of geom_bar(stat = 'identity'); removed a stale
# copy-paste comment about seasons/poems from another project.
words %>%
  count(word, sort = TRUE) %>%
  slice(1:15) %>%
  ggplot(aes(x = reorder(word, -n), y = n)) +
  geom_col() +
  theme_light() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  xlab("Words") +
  ggtitle(" Word Count (with stop words)")
#as we can see, the most popular words (at the moment) are stop words. This isn't very helpful for our analysis, so we'll take them out
Now, let's remove the stop words.
#Step 2: Using the `TidyText` package, remove stop words and generate a new word count
# Fix: give anti_join an explicit join key instead of relying on it guessing
# the shared column (this also silences the `Joining, by = "word"` message).
ufo_no_stop <- words %>%
  anti_join(stop_words, by = "word")
# Word frequency after stop-word removal: count, keep the top 15, plot.
# Fixes: TRUE instead of T; reorder(word, -n) instead of an anonymous
# function; geom_col() instead of geom_bar(stat = "identity").
ufo_no_stop %>%
  count(word, sort = TRUE) %>%
  slice(1:15) %>%
  ggplot(aes(x = reorder(word, -n), y = n)) +
  geom_col() +
  theme_light() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  xlab("Words") +
  ggtitle("Word Frequency without Stop Words")
We can see the most common words are those typical for a UFO report. Light, sky, object, moving, and looked all make sense here.
First making a term-document matrix. Following tutorial from here: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
Document matrix is a table containing the frequency of the words. Column names are words and row names are documents.
#Build a term-document matrix
# NOTE(review): TermDocumentMatrix() is normally given a tm Corpus
# (e.g. Corpus(VectorSource(...))); here it is handed a one-column tibble
# of tokens. The output below shows sensible per-word frequencies, so the
# coercion worked for this data — but verify against the documented
# Corpus-based path.
dtm <- TermDocumentMatrix(ufo_no_stop)
m <- as.matrix(dtm)
# Total frequency of each term across all documents, most frequent first.
v <- sort(rowSums(m),decreasing=TRUE)
# Two-column frequency table used by the word clouds below.
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## "light", "light", 24545
## "lights", "lights", 23581
## "sky", "sky", 23153
## "object", "object", 17931
## "bright", "bright", 13832
## "moving", "moving", 11887
## "white", "white", 9945
## "looked", "looked", 9287
## "red", "red", 8820
## "time", "time", 7510
#generate word cloud
set.seed(1234)  # fixed seed so the cloud layout is reproducible
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  words = d$word, freq = d$freq, min.freq = 1, max.words = 200,
  random.order = FALSE, rot.per = 0.35, colors = cloud_palette,
  scale = c(3.5, 0.25)  # smaller scale stops large words being cropped
)
#playing with wordcloud2
wordcloud2(data = d, size = 0.5, shape = "star")
wordcloud2(data = d, size = 1.6, color = "random-dark")
wordcloud2(d, color = "random-light", backgroundColor = "grey")
#making the word ufo with the word cloud
letterCloud(d, word = "UFO", wordSize = 2)
#ok lets try with the shape of a ufo now
# figPath = system.file("ufo.png",package = "wordcloud2")
#
# wordcloud2(d, figPath = figPath, size = 1.5,color = "skyblue")
#
#
# figPath = system.file("t.png",package = "wordcloud2")
# wordcloud2(d, figPath = figPath, size = 1.5,color = "red")
#
#
# # Change the shape using your image
# wordcloud2(d, figPath = "t.png", size = 1.5, color = "red", backgroundColor="gray")
#findFreqTerms(dtm, lowfreq = 4)
# Words correlated with "lights" across documents at |r| >= 0.3.
findAssocs(dtm, terms = "lights", corlimit = 0.3)
## $lights
## numeric(0)
# NOTE(review): numeric(0) means no term reached the 0.3 correlation cutoff.
# Presumably a side effect of how the term-document matrix was built from a
# token tibble rather than a Corpus — verify by rebuilding the TDM from
# Corpus(VectorSource(...)) before drawing conclusions.
#frequency table of words
head(d, 10)
## word freq
## "light", "light", 24545
## "lights", "lights", 23581
## "sky", "sky", 23153
## "object", "object", 17931
## "bright", "bright", 13832
## "moving", "moving", 11887
## "white", "white", 9945
## "looked", "looked", 9287
## "red", "red", 8820
## "time", "time", 7510
This gives us a quantitative measure of how much these words appear in the UFO summaries.
#making a df without the node observations
# Drop the automated MADAR node reports, keeping only human-written summaries.
df2 <- filter(UFO_and_Weather, !grepl("MADAR", text))
# Re-tokenize from the filtered reports (overwrites the earlier `words`).
words <- unnest_tokens(select(df2, text), output = word, input = text)
head(words)
## # A tibble: 6 Ă— 1
## word
## <chr>
## 1 my
## 2 wife
## 3 was
## 4 driving
## 5 southeast
## 6 on
#stop words
# Fix: explicit join key, consistent with best practice — anti_join no
# longer guesses the shared column or prints the `Joining, by = "word"` message.
ufo_no_stop2 <- words %>%
  anti_join(stop_words, by = "word")
#Build a term-document matrix
# Same TDM pipeline as above, now on the MADAR-free tokens.
# NOTE(review): as before, TermDocumentMatrix() receives a token tibble
# rather than a Corpus — the frequencies below look sane, but verify.
dtm1 <- TermDocumentMatrix(ufo_no_stop2)
m1 <- as.matrix(dtm1)
# Per-term totals across all documents, most frequent first.
v1 <- sort(rowSums(m1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
head(d1, 10)
## word freq
## "light", "light", 24541
## "lights", "lights", 23580
## "sky", "sky", 23153
## "object", "object", 17928
## "bright", "bright", 13830
## "moving", "moving", 11885
## "white", "white", 9940
## "looked", "looked", 9287
## "red", "red", 8819
## "time", "time", 7506
#generate word cloud
set.seed(123)  # reproducible layout
pal <- brewer.pal(8, "Dark2")
wordcloud(
  words = d1$word, freq = d1$freq, min.freq = 1, max.words = 200,
  random.order = FALSE, rot.per = 0.35, colors = pal,
  scale = c(3.5, 0.25)
)
#playing with wordcloud2
wordcloud2(data = d1, size = 0.5, shape = "star")
wordcloud2(data = d1, size = 1.6, color = "random-dark")
wordcloud2(d1, color = "random-light", backgroundColor = "grey")
#going to take out july 4th from all years to investigate how that influences the data set
UFO_no_july4 <- UFO_and_Weather
# Reduce date_time to an "MM-DD" string so July 4th matches across years.
UFO_no_july4$date <- format(as.Date(UFO_no_july4$date_time), "%m-%d")
# BUG FIX: the filtered result was previously printed but never assigned,
# so UFO_no_july4 still contained the July 4th rows and every plot below
# used unfiltered data. Assign the filtered result back.
UFO_no_july4 <- UFO_no_july4 %>% filter(!grepl("07-04", date))
## # A tibble: 22,123 Ă— 19
## ...1 city state date_time shape text city_…¹ city_…² year month
## <dbl> <chr> <chr> <dttm> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 0 Ches… VA 2019-12-12 18:43:00 light My w… 37.3 -77.4 2019 12
## 2 1 Rock… CT 2019-03-22 18:30:00 circ… I th… 41.7 -72.6 2019 3
## 3 2 Otta… ON 2019-04-17 02:00:00 tear… I wa… 45.4 -75.7 2019 4
## 4 3 Kirb… TX 2019-04-02 20:25:00 disk The … 30.7 -94.0 2019 4
## 5 4 Tucs… AZ 2019-05-01 11:00:00 unkn… Desc… 32.3 -111. 2019 5
## 6 5 Gold… AZ 2019-04-10 17:00:00 circ… Apr.… 33.4 -111. 2019 4
## 7 6 Broo… IN 2019-06-18 21:00:00 sphe… Meta… 39.4 -85.0 2019 6
## 8 7 Melb… FL 2019-06-12 22:00:00 unkn… We t… 28.0 -80.5 2019 6
## 9 8 Carr… NM 2019-06-11 22:00:00 chan… I wa… 33.8 -106. 2019 6
## 10 9 Waco TX 2018-06-15 01:00:00 circ… I wa… 31.6 -97.1 2018 6
## # … with 22,113 more rows, 9 more variables: day <dbl>, hour <time>,
## # temperature <dbl>, relative_humidity <dbl>, precipitation <dbl>,
## # snow <lgl>, wind_direction <dbl>, wind_speed <dbl>, date <chr>, and
## # abbreviated variable names ¹​city_latitude, ²​city_longitude
OK, let's see how that worked.
# Sightings per month (July 4th removed), ordered by frequency.
UFO_no_july4 %>%
  ggplot(aes(x = reorder(month, month, FUN = length))) +
  geom_bar() +
  coord_flip()
# Zoom in on July only and look at the distribution across days.
just_july1 <- filter(UFO_no_july4, month == 7)
just_july1 %>%
  ggplot(aes(x = reorder(day, day, FUN = length))) +
  geom_bar() +
  coord_flip()
To see the most popular day of the week, we need to manipulate the original date-time variable.
UFO_and_Weather$weekday <- weekdays(UFO_and_Weather$date_time)
Now, let’s plot by day of the week
#plotting
# Bar chart of sighting counts by weekday, ordered least to most frequent.
UFO_and_Weather %>%
  ggplot(aes(x = reorder(weekday, weekday, FUN = length))) +
  geom_bar() +
  labs(title = "Sightings by day of week", x = "day of week", y = "count") +
  coord_flip()
Weekends have the most sightings, which is interesting.